
pacman::p_load(tidyverse,
               fst,
               tidymodels,
               doParallel,
               tictoc,
               logr
)

# Character vector of variable names, read from the `Variable` column of
# df_lasso_vars_and_dict.csv in the lasso_selected_vars_* output folder.
v_lasso_vars <- pull(
  read_csv(
    paste0(
      "../../data/pipeline_outputs/", SPECIAL_SUFFIX,
      "/lasso_selected_vars_", TRAINING_SUFFIX, "_",
      RAND_NO_CID_SMALLEST_LARGEST, "/",
      "df_lasso_vars_and_dict.csv"
    )
  ),
  Variable
)

# Read the training set, loading only the columns used by this script: the
# identifiers (cid, qtr), the outcome (t_default), rand_no_cid, and every
# variable named in `v_lasso_vars`.
#
# `v_lasso_vars` is concatenated as a plain character vector. `all_of()` is a
# tidyselect helper for use inside selecting functions such as `select()`;
# `read_fst(columns = )` just takes a character vector, and calling `all_of()`
# outside a selection context is deprecated as of tidyselect 1.2.0.
df_training_raw <- read_fst(
  paste0(
    "../../data/pipeline_outputs/",
    SPECIAL_SUFFIX, "/", "train_",
    TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "/",
    "training.fst"
  ),
  columns = c("cid", "qtr", "t_default", "rand_no_cid", v_lasso_vars)
)

# Derive the helper columns used further down:
#   year         - calendar year extracted from `qtr`
#   year_outcome - (10 * year) + t_default; the recipe below gives this column
#                  the "cv_strata" role (assumes t_default is numeric 0/1 at
#                  this point - TODO confirm)
#   t_default    - outcome recoded as a factor with levels 0 and 1
#
# `year()` is namespace-qualified because lubridate is only attached by
# `tidyverse` from version 2.0.0 onwards; with an older tidyverse the bare
# call fails with "could not find function".
#
# Order matters inside mutate(): year_outcome must be computed while
# t_default is still numeric, i.e. before it is overwritten by the factor.
df_training <- df_training_raw %>%
  mutate(
    year = lubridate::year(qtr),
    year_outcome = (10 * year) + t_default,
    t_default = factor(t_default, levels = c(0, 1))
  )


# Convert categorical to dummy --------------------------------------------

# Formula handed to the recipe: t_default is the response; the terms are the
# id columns, year_outcome, and every column whose name contains "attr".
dummies_formula <- df_training %>%
  select(cid, qtr, year_outcome, rand_no_cid, contains("attr")) %>%
  colnames() %>%
  reformulate(response = "t_default")

# Recipe that turns nominal predictors into dummy columns.
#   - cid, qtr and rand_no_cid get the "id" role and year_outcome the
#     "cv_strata" role, so none of them is treated as a predictor.
#   - step_novel() runs before step_dummy() on the same nominal predictors.
#   - step_intercept() appends an intercept column.
# The recipe is prepped (estimated) on df_training right away.
recipe_create_dummies <-
  recipe(formula = dummies_formula, data = df_training) %>%
  update_role(cid, qtr, rand_no_cid, new_role = "id") %>%
  update_role(year_outcome, new_role = "cv_strata") %>%
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = FALSE) %>%
  step_intercept() %>%
  prep(strings_as_factors = FALSE)

# Persist the prepped recipe next to the final model artefacts.
saveRDS(
  recipe_create_dummies,
  file = paste0(
    "../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "final_model_xgb_",
    TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "_", MODEL_METRIC, "/",
    "dummy_recipe.rds"
  )
)


# Apply the prepped recipe to the same training data it was estimated on.
df_training_baked <- recipe_create_dummies %>%
  bake(new_data = df_training)

# Model formula: t_default explained by every baked column whose name
# contains "attr" (so the id, strata and intercept columns are left out).
xgb_formula <- df_training_baked %>%
  select(contains("attr")) %>%
  colnames() %>%
  reformulate(response = "t_default")

# Boosted-tree classification spec on the xgboost engine. Every main
# hyperparameter is left as a tune() placeholder; the placeholders are filled
# in later by finalize_workflow() from the stored optimal values.
# Engine arguments passed through to xgboost fitting:
#   event_level = "second"
#   validation  = 0.2
xgb_spec <- boost_tree(
  mtry = tune(),
  trees = tune(),
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  stop_iter = tune()
) %>%
  set_mode("classification") %>%
  set_engine(
    "xgboost",
    event_level = "second",
    validation = 0.2
  )


# Hyperparameter values read from df_optimal_hyperparameters.csv; used below
# to finalize the workflow.
best_model <- read_csv(
  paste0(
    "../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/",
    "final_model_xgb_", TRAINING_SUFFIX,
    "_", RAND_NO_CID_SMALLEST_LARGEST, "_",
    MODEL_METRIC, "/", "df_optimal_hyperparameters.csv"
  )
)

# Workflow bundling the model spec with the model formula. The two add_*()
# calls are independent of each other.
xgb_wf <- workflow() %>%
  add_model(xgb_spec) %>%
  add_formula(xgb_formula)

print("Fitting XGB final model")

# Fill the tune() placeholders with the stored hyperparameters and fit the
# workflow on the baked training data; the elapsed time goes to the tictoc log.
# NOTE(review): no RNG seed is set anywhere in this script before fitting -
# confirm whether the calling script sets one.
tic(msg = "Fitting Final Model")

fit_xgb <- fit(
  finalize_workflow(xgb_wf, best_model),
  data = df_training_baked
)

toc(log = TRUE)

# Persist the fitted workflow.
saveRDS(
  fit_xgb,
  file = paste0(
    "../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "final_model_xgb_",
    TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "_", MODEL_METRIC, "/",
    "final_model.rds"
  )
)

# In-sample predicted probabilities: the `.pred_1` column of the "prob"
# predictions on the baked training data.
tic(msg = "Getting XGB training fitted values")
v_fitted <- predict(
  fit_xgb,
  new_data = df_training_baked,
  type = "prob"
)[[".pred_1"]]
toc(log = TRUE)

# One row per training observation: ids, the fitted value, and a key of the
# form "<qtr without dashes>_<rand_no_cid>" that decides which output file the
# row is written to.
df_fitted <- df_training_baked %>%
  transmute(
    cid,
    qtr,
    qtr_rand_no_cid = paste0(str_remove_all(qtr, "-"), "_", rand_no_cid),
    v_fitted = v_fitted
  )

# Distinct keys, in order of first appearance.
v_unique_qtr_rand_no_cid <- unique(df_fitted$qtr_rand_no_cid)

#' Write the fitted values belonging to one `qtr_rand_no_cid` key to an fst file.
#'
#' Keeps the rows of `df_fitted_values` whose `qtr_rand_no_cid` equals
#' `qtr_rand_no_cids`, reduces them to the columns `cid`, `qtr` and `v_fitted`,
#' and writes them to "fitted_<key>.fst" inside the "fitted_training_xgb_*"
#' output folder. The folder name is built from the globals `SPECIAL_SUFFIX`,
#' `TRAINING_SUFFIX`, `RAND_NO_CID_SMALLEST_LARGEST` and `MODEL_METRIC`.
#'
#' @param df_fitted_values Data frame containing at least the columns `cid`,
#'   `qtr`, `qtr_rand_no_cid` and `v_fitted`.
#' @param qtr_rand_no_cids Key value compared against `qtr_rand_no_cid`; it is
#'   also embedded in the output file name.
#' @return Whatever `write_fst()` returns; the function is called for its
#'   side effect of writing the file.
Save_Fitted_Values <- function(df_fitted_values, qtr_rand_no_cids) {

  output_path <- paste0(
    "../../data/pipeline_outputs/", SPECIAL_SUFFIX, "/", "fitted_training_xgb_",
    TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "_", MODEL_METRIC,
    "/", "fitted_", qtr_rand_no_cids, ".fst"
  )

  rows_for_key <- filter(df_fitted_values, qtr_rand_no_cid == qtr_rand_no_cids)

  write_fst(select(rows_for_key, cid, qtr, v_fitted), output_path)

}

# Write one fst file per distinct qtr_rand_no_cid key; timed as a single step.
tic(msg = "Saving XGB Fitted Values")
walk(
  v_unique_qtr_rand_no_cid,
  function(key) Save_Fitted_Values(df_fitted, key)
)
toc(log = TRUE)


# Turn the tictoc log entries into a two-column table by splitting each entry
# around its colon: the text before the last colon becomes `Process`, the text
# after the first ": " becomes `Time`.
df_timing_log <- tic.log() %>%
  unlist() %>%
  {
    tibble(
      Process = str_extract(., pattern = ".+(?=:)"),
      Time = str_extract(., pattern = "(?<=: ).+")
    )
  }

# Reset the tictoc timer stack and its log now that the entries are captured.
tic.clear()
tic.clearlog()

write_csv(
  df_timing_log,
  paste0(
    "../../data/pipeline_outputs/",
    SPECIAL_SUFFIX, "/", "timing_xgb_",
    TRAINING_SUFFIX, "_", RAND_NO_CID_SMALLEST_LARGEST, "_", MODEL_METRIC,
    "/", "model_timing_log.csv"
  )
)




